# Required packages
import pandas as pd
import numpy as np
import catboost
# Sklearn
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, KFold, StratifiedShuffleSplit
from sklearn import metrics
# Visualisation libraries
## progressbar
import progressbar
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib import cm
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we use Kaggle's Pima Indians Diabetes dataset. The Pima Indians are a group of Native Americans living in an area consisting of what is now central and southern Arizona. A variety of statistical methods are used here for prediction.
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
The datasets consist of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.
| Feature | Explanations |
|---|---|
| Pregnancies | Number of times pregnant |
| Glucose | Plasma glucose concentration at 2 hours in an oral glucose tolerance test |
| Blood Pressure | Diastolic blood pressure (mm Hg) |
| Skin Thickness | Triceps skinfold thickness (mm) |
| Insulin | 2-Hour serum insulin (mu U/ml) |
| BMI | Body mass index (weight in kg/(height in m)^2) |
| Diabetes Pedigree Function | Diabetes pedigree function |
| Age | Age (years) |
| Outcome | Whether or not a patient has diabetes |
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print Text as a coloured banner followed by a '=' ruler padded to width L.

    C selects the background colour of the text, T its foreground colour;
    the ruler is drawn in colour C.
    """
    backgrounds = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
                   'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
                   'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    foregrounds = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
                   'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
                   'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    # Ruler fills the rest of the line: total width L minus text and one space.
    ruler = '=' * (L - len(Text) - 1)
    banner = (backgrounds[C] + foregrounds[T] + Style.NORMAL + Text + Style.RESET_ALL
              + ' ' + foregrounds[C] + Style.NORMAL + ruler + Style.RESET_ALL)
    print(banner)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of L '=' characters in the colour C."""
    palette = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
               'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
               'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    print(palette[C] + Style.NORMAL + '=' * L + Style.RESET_ALL)
def Search_List(Key, List): return [s for s in List if Key in s]
# Load the pre-standardized Pima Indians Diabetes dataset
# (features already z-scored; see the feature table above).
Data = pd.read_csv('pima-indians-diabetes-database/diabetes_STD.csv')
Header('Standardized Dataset:')
display(Data.head())
# One-row summary of the dataset dimensions (hide_index drops the row index).
display(pd.DataFrame({'Number of Instances': [Data.shape[0]], 'Number of Attributes': [Data.shape[1]]}).style.hide_index())
Standardized Dataset: ==============================================================================
| Pregnancies | Glucose | Blood Pressure | Skin Thickness | Insulin | BMI | Diabetes Pedigree Function | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.639947 | 0.848324 | 0.149641 | 0.907270 | -0.692891 | 0.204013 | 0.468492 | 1.425995 | 1 |
| 1 | -0.844885 | -1.123396 | -0.160546 | 0.530902 | -0.692891 | -0.684422 | -0.365061 | -0.190672 | 0 |
| 2 | 1.233880 | 1.943724 | -0.263941 | -1.288212 | -0.692891 | -1.103255 | 0.604397 | -0.105584 | 1 |
| 3 | -0.844885 | -0.998208 | -0.160546 | 0.154533 | 0.123302 | -0.494043 | -0.920763 | -1.041549 | 0 |
| 4 | -1.141852 | 0.504055 | -1.504687 | 0.907270 | 0.765836 | 1.409746 | 5.484909 | -0.020496 | 1 |
| Number of Instances | Number of Attributes |
|---|---|
| 768 | 9 |
# Separate the feature matrix from the binary target column.
Target = 'Outcome'
X = Data.drop(columns = [Target])
y = Data[Target]
# Map class codes to human-readable labels: 0 -> 'Non-Diabetic', 1 -> 'Diabetic'.
Labels_dict = dict(zip([0, 1], ['Non-Diabetic', 'Diabetic']))
def DatasetTargetDist(Inp, Target, Labels_dict, PD):
    """Display the class distribution of ``Target`` in ``Inp``: a
    count/percentage table on the left and a pie chart on the right.

    Parameters
    ----------
    Inp : pd.DataFrame
        Dataset containing the target column.
    Target : str
        Name of the target column.
    Labels_dict : dict
        Mapping from class codes to readable labels.
    PD : dict
        Plot settings; keys used here: column_widths, pull, textfont,
        PieColors, hole, height, legend_title, TableColors,
        tablecolumnwidth, title_x, title_y.
    """
    # Build the per-class count/percentage table.
    Table = Inp[Target].value_counts().to_frame('Count').reset_index(drop = False).rename(columns = {'index':Target})
    Table[Target] = Table[Target].replace(Labels_dict)
    Table['Percentage'] = np.round(100*(Table['Count']/Table['Count'].sum()),2)
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=PD['column_widths'],
                        specs=[[{"type": "table"},{"type": "pie"}]])
    # Right subplot: donut/pie chart of the class counts.
    fig.add_trace(go.Pie(labels=Table[Target].values, values=Table['Count'].values,
                         pull=PD['pull'], textfont=dict(size= PD['textfont']),
                         marker=dict(colors = PD['PieColors'], line=dict(color='black', width=1))), row=1, col=2)
    fig.update_traces(hole=PD['hole'])
    fig.update_layout(height = PD['height'], legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
    # Left subplot: the table. '%%%.2f' renders e.g. 34.9 as '%34.90'
    # (literal percent sign followed by the value).
    T = Table.copy()
    T['Percentage'] = T['Percentage'].map(lambda x: '%%%.2f' % x)
    # go.Table wants one array per column.
    Temp = []
    for i in T.columns:
        Temp.append(T.loc[:,i].values)
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= PD['TableColors'][0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [PD['TableColors'][1], PD['TableColors'][1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    fig.update_layout(title={'text': '<b>' + Target + '<b>', 'x':PD['title_x'],
                             'y':PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# 'pull' offsets only the last pie slice by 5% to emphasise the positive class.
Pull = [0 for x in range((len(Labels_dict)-1))]
Pull.append(.05)
# PD: shared plot-settings dict consumed by the plotting helpers below.
PD = dict(PieColors = ['SeaGreen','FireBrick'],
          TableColors = ['Navy','White'], hole = .4,
          column_widths=[0.6, 0.4],textfont = 14, height = 350, tablecolumnwidth = [0.20, 0.12, 0.15],
          pull = Pull, legend_title = Target, title_x = 0.5, title_y = 0.8)
del Pull
DatasetTargetDist(Data, Target, Labels_dict, PD)
StratifiedShuffleSplit is a variation of ShuffleSplit which returns stratified splits: each split contains approximately the same percentage of samples of each target class as the complete set.
Test_Size = 0.3
# One stratified shuffle split: train and test keep the same class
# proportions as the full target vector.
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
    # split() yields POSITIONAL indices, so pandas objects must be sliced
    # with .iloc; the previous .loc only worked by accident because the
    # index happened to be a default RangeIndex.
    if isinstance(X, pd.DataFrame):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    else:
        X_train, X_test = X[train_index], X[test_index]
    if isinstance(y, pd.Series):
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    else:
        y_train, y_test = y[train_index], y[test_index]
del sss
def Train_Test_Dist(X_train, y_train, X_test, y_test, PD, Labels_dict = Labels_dict):
    """Visualise the train/test split: a table of array shapes (left) and one
    pie chart of the class distribution per set (middle = train, right = test).

    PD keys used: column_widths, pull, textfont, PieColors, legend_title,
    TableColors, tablecolumnwidth, title_x, title_y, height.
    NOTE(review): the Labels_dict default binds the module-level dict at
    definition time.
    """
    def ToSeries(x):
        # Coerce array-likes to a pandas Series; copy if already a Series.
        if not isinstance(x, pd.Series):
            Out = pd.Series(x)
        else:
            Out = x.copy()
        return Out
    fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.02, column_widths= PD['column_widths'],
                        specs=[[{"type": "table"},{'type':'domain'}, {'type':'domain'}]])
    # Pie charts: columns 2 (train) and 3 (test).
    C = 2
    # NOTE: the loop variable 'y' shadows the module-level target inside this function.
    for y in [ToSeries(y_train).replace(Labels_dict), ToSeries(y_test).replace(Labels_dict)]:
        fig.add_trace(go.Pie(labels= list(Labels_dict.values()),
                             values= y.value_counts().values, pull=PD['pull'],
                             textfont=dict(size=PD['textfont']),
                             marker=dict(colors = PD['PieColors'],
                                         line=dict(color='black', width=1))), row=1, col=C)
        fig.update_traces(hole=.5)
        fig.update_layout(legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
        C+=1
    # Left subplot: table listing the shape of each split array.
    Table = pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
                               'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).astype(str)
    T = Table.copy()
    # go.Table wants one array per column.
    Temp = []
    for i in T.columns:
        Temp.append(T.loc[:,i].values)
    TableColors = PD['TableColors']
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= TableColors[0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [TableColors[1], TableColors[1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    fig.update_layout(title={'text': '<b>' + 'Dataset Distribution' + '<b>', 'x':PD['title_x'],
                             'y':PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    # Height is optional; skip when PD['height'] is None.
    if not PD['height'] == None:
        fig.update_layout(height = PD['height'])
    fig.show()
# Re-use PD with layout tweaks for the three-panel split visualisation.
PD.update(dict(column_widths=[0.3, 0.3, 0.3], tablecolumnwidth = [0.2, 0.4], height = 350, legend_title = Target))
Train_Test_Dist(X_train, y_train, X_test, y_test, PD)
CatBoost Classifier is based on gradient boosted decision trees. During training, a set of decision trees is built consecutively. Each successive tree is built with reduced loss compared to the previous trees.
# Monte-Carlo cross-validation: 5 independent stratified shuffle splits.
n_splits = 5
N = int(1e2)  # boosting iterations per fold
# NOTE(review): task_type="GPU" / devices='0:1' requires a CUDA-enabled
# CatBoost build and available GPUs — confirm before running elsewhere.
model = catboost.CatBoostClassifier(iterations= N,
                                    task_type="GPU",
                                    devices='0:1',
                                    max_ctr_complexity=6,
                                    random_seed= 42, od_type='Iter', od_wait=N, verbose=int(N/10), depth=5)
sss = StratifiedShuffleSplit(n_splits = n_splits, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)
# Per-fold classification reports and confusion matrices, accumulated below.
Reports_Train = []
Reports_Test = []
CM_Train = []
CM_Test = []
# Fit and evaluate one model per stratified shuffle split.
for train_index, test_index in sss.split(X, y):
    # split() yields POSITIONAL indices, so pandas objects must be sliced
    # with .iloc; .loc would be wrong as soon as the index is not a
    # default RangeIndex.
    if isinstance(X, pd.DataFrame):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    else:
        X_train, X_test = X[train_index], X[test_index]
    if isinstance(y, pd.Series):
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    else:
        y_train, y_test = y[train_index], y[test_index]
    # Fit with the held-out fold as eval set (drives the overfitting detector).
    _ = model.fit(X_train, y_train, eval_set= (X_test, y_test), plot = False, verbose = False)
    # Train-set metrics for this fold.
    y_pred = model.predict(X_train)
    R = pd.DataFrame(metrics.classification_report(y_train, y_pred, target_names= list(Labels_dict.values()), output_dict=True)).T
    Reports_Train.append(R.values)
    CM_Train.append(metrics.confusion_matrix(y_train, y_pred))
    # Test-set metrics for this fold.
    y_pred = model.predict(X_test)
    R = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=list(Labels_dict.values()), output_dict=True)).T
    Reports_Test.append(R.values)
    CM_Test.append(metrics.confusion_matrix(y_test, y_pred))
def _mean_std_report(report_values, template):
    """Aggregate per-fold classification reports into 'mean ± std' strings.

    report_values : list of 2-D ndarrays (one per fold, same shape).
    template : DataFrame supplying the shape, row and column labels
        (any one fold's report frame).
    """
    stacked = np.stack([r.ravel() for r in report_values])
    mean = pd.DataFrame(stacked.mean(axis = 0).reshape(template.shape),
                        index = template.index, columns = template.columns)
    std = pd.DataFrame(stacked.std(axis = 0).reshape(template.shape),
                       index = template.index, columns = template.columns)
    return mean.applymap(lambda x: ('%.4f' % x)) + ' ± ' + std.applymap(lambda x: ('%.4f' % x))

def _mean_confusion(cms):
    """Element-wise mean of per-fold confusion matrices, rounded to int counts."""
    return np.stack([c.ravel() for c in cms]).mean(axis = 0).reshape(cms[0].shape).round(0).astype(int)

# Collapse the per-fold lists into single summary objects. R (the last fold's
# report frame) supplies the row/column labels, as in the loop above.
Reports_Train = _mean_std_report(Reports_Train, R)
CM_Train = _mean_confusion(CM_Train)
Reports_Test = _mean_std_report(Reports_Test, R)
CM_Test = _mean_confusion(CM_Test)
# Promote the report index to a labelled first column for the styled display.
# NOTE: '% i' (with the space flag) is intentional — the column name must
# match the subset label used by Confusion_Mat.
Reports_Train = Reports_Train.reset_index().rename(columns ={'index': 'Train Set (CV = % i)' % n_splits})
Reports_Test = Reports_Test.reset_index().rename(columns ={'index': 'Test Set (CV = % i)' % n_splits})
def Confusion_Mat(CM_Train, CM_Test, PD, n_splits = 10):
    """Plot raw and row-normalized confusion matrices for the train and test
    sets, then display the styled classification-report tables.

    Parameters
    ----------
    CM_Train, CM_Test : ndarray
        Confusion matrices (counts) for each set.
    PD : dict
        Plot settings; keys used: FS (figsize), annot_kws (annotation font
        size), shrink (colorbar shrink), Labels (tick labels).
    n_splits : int or None
        Number of CV splits used in the titles; None for a single split.

    Relies on the module-level Reports_Train / Reports_Test frames, whose
    first column must be named exactly Titles[0] / Titles[1].
    """
    if n_splits is None:
        Titles = ['Train Set', 'Test Set']
    else:
        Titles = ['Train Set (CV = % i)' % n_splits, 'Test Set (CV = % i)' % n_splits]
    CM = [CM_Train, CM_Test]
    # Colormaps per set: (raw, normalized) for train, then for test.
    Cmap = ['Greens', 'YlGn','Blues', 'PuBu']
    for i in range(2):
        fig, ax = plt.subplots(1, 2, figsize= PD['FS'])
        fig.suptitle(Titles[i], weight = 'bold', fontsize = 16)
        _ = sns.heatmap(CM[i], annot=True, annot_kws={"size": PD['annot_kws']}, cmap=Cmap[2*i], ax = ax[0],
                        linewidths = 0.2, cbar_kws={"shrink": PD['shrink']})
        _ = ax[0].set_title('Confusion Matrix')
        # Row-normalize so each true-class row sums to 1 (per-class recall).
        Temp = np.round(CM[i].astype('float') / CM[i].sum(axis=1)[:, np.newaxis], 2)
        _ = sns.heatmap(Temp,
                        annot=True, annot_kws={"size": PD['annot_kws']}, cmap=Cmap[2*i+1], ax = ax[1],
                        linewidths = 0.4, vmin=0, vmax=1, cbar_kws={"shrink": PD['shrink']})
        _ = ax[1].set_title('Normalized Confusion Matrix')
        for a in ax:
            _ = a.set_xlabel('Predicted labels')
            _ = a.set_ylabel('True labels')
            _ = a.xaxis.set_ticklabels(PD['Labels'])
            _ = a.yaxis.set_ticklabels(PD['Labels'])
            _ = a.set_aspect(1)
    # BUG FIX: select the set-name column via Titles rather than rebuilding the
    # CV label here — the old '% i' % n_splits raised TypeError (and named a
    # non-existent column) when called with n_splits=None.
    display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
        set_properties(subset=[Titles[0]], **{'background-color': 'SeaGreen', 'color': 'White'}))
    display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
        set_properties(subset=[Titles[1]], **{'background-color': 'RoyalBlue', 'color': 'White'}))
# Fresh plot-settings dict for the confusion-matrix figures.
PD = dict(FS = (10, 5), annot_kws = 14, shrink = .6, Labels = list(Labels_dict.values()))
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = n_splits)
| Train Set (CV = 5) | precision | recall | f1-score | support |
|---|---|---|---|---|
| Non-Diabetic | 0.8683 ± 0.0078 | 0.9343 ± 0.0063 | 0.9001 ± 0.0059 | 350.0000 ± 0.0000 |
| Diabetic | 0.8567 ± 0.0126 | 0.7348 ± 0.0174 | 0.7910 ± 0.0135 | 187.0000 ± 0.0000 |
| accuracy | 0.8648 ± 0.0082 | 0.8648 ± 0.0082 | 0.8648 ± 0.0082 | 0.8648 ± 0.0082 |
| macro avg | 0.8625 ± 0.0091 | 0.8345 ± 0.0101 | 0.8455 ± 0.0097 | 537.0000 ± 0.0000 |
| weighted avg | 0.8643 ± 0.0084 | 0.8648 ± 0.0082 | 0.8621 ± 0.0085 | 537.0000 ± 0.0000 |
| Test Set (CV = 5) | precision | recall | f1-score | support |
|---|---|---|---|---|
| Non-Diabetic | 0.7839 ± 0.0111 | 0.8493 ± 0.0328 | 0.8148 ± 0.0129 | 150.0000 ± 0.0000 |
| Diabetic | 0.6735 ± 0.0396 | 0.5654 ± 0.0394 | 0.6127 ± 0.0177 | 81.0000 ± 0.0000 |
| accuracy | 0.7498 ± 0.0132 | 0.7498 ± 0.0132 | 0.7498 ± 0.0132 | 0.7498 ± 0.0132 |
| macro avg | 0.7287 ± 0.0189 | 0.7074 ± 0.0119 | 0.7138 ± 0.0120 | 231.0000 ± 0.0000 |
| weighted avg | 0.7452 ± 0.0136 | 0.7498 ± 0.0132 | 0.7440 ± 0.0114 | 231.0000 ± 0.0000 |
Test_Size = 0.3
# Re-create the single stratified train/test split for the final model.
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
    # split() yields POSITIONAL indices, so pandas objects must be sliced
    # with .iloc; the previous .loc only worked by accident because the
    # index happened to be a default RangeIndex.
    if isinstance(X, pd.DataFrame):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    else:
        X_train, X_test = X[train_index], X[test_index]
    if isinstance(y, pd.Series):
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    else:
        y_train, y_test = y[train_index], y[test_index]
del sss
# Final model: a much larger iteration budget (10k). With an eval_set given,
# CatBoost keeps the best-scoring iteration (the log below shows the model
# shrunk to iteration 70's snapshot).
N = int(1e4)
# NOTE(review): task_type="GPU" / devices='0:1' requires a CUDA-enabled
# CatBoost build and available GPUs — confirm before running elsewhere.
model = catboost.CatBoostClassifier(iterations= N,
                                    task_type="GPU",
                                    devices='0:1',
                                    max_ctr_complexity=6,
                                    random_seed= 42, od_type='Iter', od_wait=N, verbose=int(N/20), depth=5)
_ = model.fit(X_train, y_train, eval_set= (X_test, y_test), plot = False)
Learning rate set to 0.038557 0: learn: 0.6751238 test: 0.6789489 best: 0.6789489 (0) total: 23.1ms remaining: 1m 55s 250: learn: 0.2658617 test: 0.5204797 best: 0.4907738 (70) total: 5.5s remaining: 1m 44s 500: learn: 0.1792998 test: 0.5552010 best: 0.4907738 (70) total: 11s remaining: 1m 39s 750: learn: 0.1296331 test: 0.5895777 best: 0.4907738 (70) total: 16.9s remaining: 1m 35s 1000: learn: 0.1050362 test: 0.6093619 best: 0.4907738 (70) total: 22.7s remaining: 1m 30s 1250: learn: 0.0867549 test: 0.6315973 best: 0.4907738 (70) total: 28.3s remaining: 1m 24s 1500: learn: 0.0738864 test: 0.6547597 best: 0.4907738 (70) total: 33.9s remaining: 1m 19s 1750: learn: 0.0656700 test: 0.6740223 best: 0.4907738 (70) total: 39.6s remaining: 1m 13s 2000: learn: 0.0578875 test: 0.6907689 best: 0.4907738 (70) total: 45.3s remaining: 1m 7s 2250: learn: 0.0491403 test: 0.7127911 best: 0.4907738 (70) total: 51.1s remaining: 1m 2s 2500: learn: 0.0441805 test: 0.7260640 best: 0.4907738 (70) total: 56.8s remaining: 56.8s 2750: learn: 0.0385221 test: 0.7467867 best: 0.4907738 (70) total: 1m 2s remaining: 51.1s 3000: learn: 0.0337336 test: 0.7597182 best: 0.4907738 (70) total: 1m 8s remaining: 45.4s 3250: learn: 0.0309095 test: 0.7730568 best: 0.4907738 (70) total: 1m 14s remaining: 39.9s 3500: learn: 0.0290824 test: 0.7841801 best: 0.4907738 (70) total: 1m 19s remaining: 34.2s 3750: learn: 0.0267074 test: 0.7991542 best: 0.4907738 (70) total: 1m 25s remaining: 28.4s 4000: learn: 0.0251259 test: 0.8117723 best: 0.4907738 (70) total: 1m 31s remaining: 22.7s 4250: learn: 0.0236417 test: 0.8232092 best: 0.4907738 (70) total: 1m 36s remaining: 17s 4500: learn: 0.0222351 test: 0.8365216 best: 0.4907738 (70) total: 1m 42s remaining: 11.4s 4750: learn: 0.0207371 test: 0.8481484 best: 0.4907738 (70) total: 1m 48s remaining: 5.67s 4999: learn: 0.0194959 test: 0.8583739 best: 0.4907738 (70) total: 1m 53s remaining: 0us bestTest = 0.4907737831 bestIteration = 70 Shrink model to first 71 
iterations.
# Evaluate the final model on the train split.
y_pred = model.predict(X_train)
Reports_Train = pd.DataFrame(metrics.classification_report(y_train, y_pred, target_names=list(Labels_dict.values()), output_dict=True)).T
CM_Train = metrics.confusion_matrix(y_train, y_pred)
# Evaluate on the held-out test split.
y_pred = model.predict(X_test)
Reports_Test = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=list(Labels_dict.values()), output_dict=True)).T
CM_Test = metrics.confusion_matrix(y_test, y_pred)
# Promote the report index to a labelled first column for the styled display.
Reports_Train = Reports_Train.reset_index().rename(columns ={'index': 'Train Set'})
Reports_Test = Reports_Test.reset_index().rename(columns ={'index': 'Test Set'})
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
set_properties(subset=['Train Set'], **{'background-color': 'SeaGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
set_properties(subset=['Test Set'], **{'background-color': 'RoyalBlue', 'color': 'White'}))
# n_splits=None: single split, so titles carry no CV annotation.
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = None)
| Train Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Non-Diabetic | 0.839196 | 0.954286 | 0.893048 | 350.000000 |
| Diabetic | 0.884892 | 0.657754 | 0.754601 | 187.000000 |
| accuracy | 0.851024 | 0.851024 | 0.851024 | 0.851024 |
| macro avg | 0.862044 | 0.806020 | 0.823825 | 537.000000 |
| weighted avg | 0.855109 | 0.851024 | 0.844837 | 537.000000 |
| Test Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Non-Diabetic | 0.767442 | 0.880000 | 0.819876 | 150.000000 |
| Diabetic | 0.694915 | 0.506173 | 0.585714 | 81.000000 |
| accuracy | 0.748918 | 0.748918 | 0.748918 | 0.748918 |
| macro avg | 0.731179 | 0.693086 | 0.702795 | 231.000000 |
| weighted avg | 0.742010 | 0.748918 | 0.737767 | 231.000000 |
Finally, we compute the model's overall classification score on both the training and validation sets.
# CatBoostClassifier.score returns the mean accuracy for classifiers, not an
# R^2 value — label the row accordingly (the old 'R2 Score' label was wrong).
display(pd.DataFrame({'Train Set': {'Accuracy': model.score(X_train, y_train)},
                      'Validation Set': {'Accuracy': model.score(X_test, y_test)}}))
| Train Set | Validation Set | |
|---|---|---|
| R2 Score | 0.851024 | 0.748918 |